##Goals

Load in phyloseq data with rooted tree. Evaluate sequencing depth and remove sample. Normalize the read counts between samples. Calculate community dissimilarities. Numbers between 0 and 1. If 0, completely similar versus if they are 1, then they’re completely dissimilar. Sorensen: Shared Species as a binary value: Abundance-unweighted Bray-Curtis: Shared Abundant species: Abundance-weighted (Abundance-)Weighted UNIFRAC: Consider Abundant Species and where they fall on the tree Visualize the community data with two unconstrained Ordinations: PCoA: Linear Method. Eigenvalue = how much variation is explained by each axis. Choose to view axis 1, 2, 3, etc. and plot them together. NMDS: Non-linear. Smush multiple Dimensions into 2 or 3 axes. Need to report Stress value (ideally <0.15). Run statistics with PERMANOVA and betadispR.

##Set seed

set.seed(2443)

##Load libraries

#install.packages("vegan")
pacman::p_load(tidyverse, devtools, phyloseq, patchwork, vegan,
               install = FALSE)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Load Lakestie colors 
lakesite_colors <- c("110" = "dodgerblue4","M15" = "dodgerblue2","M45" = "#D9CC3C","MLB" = "#A0E0BA")

##Load Data

# Load in rooted phylogenetic tree! 
load("data/03_Phylogenetic_Tree/phytree_preprocessed_physeq.RData")

##Explore Read Counts

# Calculate the total number of reads per sample. 
raw_TotalSeqs_df_combined  <- 
  midroot_physeq %>%
  # calculate the sample read sums 
  sample_sums() %>%
  data.frame()
# name the column 
colnames(raw_TotalSeqs_df_combined)[1] <- "TotalSeqs"
  
raw_TotalSeqs_df_DNA  <- 
  midroot_physeq_DNA %>%
  # calculate the sample read sums 
  sample_sums() %>%
  data.frame()
# name the column 
colnames(raw_TotalSeqs_df_DNA)[1] <- "TotalSeqs"
  
raw_TotalSeqs_df_RNA  <- 
  midroot_physeq_RNA %>%
  # calculate the sample read sums 
  sample_sums() %>%
  data.frame()
# name the column 
colnames(raw_TotalSeqs_df_RNA)[1] <- "TotalSeqs"
  

head(raw_TotalSeqs_df_combined)
##            TotalSeqs
## 110D2W115D     13235
## 110D2W115R     12953
## 110D2W415D     14765
## 110D2W415R     11587
## 110D2W515D     18867
## 110D2W515R     12432
head(raw_TotalSeqs_df_DNA)
##            TotalSeqs
## 110D2W115D     13235
## 110D2W415D     14765
## 110D2W515D     18867
## 110D2W615D      9289
## 110D2W715D     14757
## 110D2W815D     36720
head(raw_TotalSeqs_df_RNA)
##            TotalSeqs
## 110D2W115R     12953
## 110D2W415R     11587
## 110D2W515R     12432
## 110D2W615R     16887
## 110D2W715R     14711
## 110D2W815R     10574
# make histogram of raw reads 
raw_TotalSeqs_df_combined %>%
  ggplot(aes(x = TotalSeqs)) + 
  geom_histogram(bins = 50) + 
  scale_x_continuous(limits = c(0, 10000)) + 
  labs(title = "Raw Combined Sequencing Depth Distribution") + 
  theme_classic()
## Warning: Removed 74 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

raw_TotalSeqs_df_DNA %>%
  ggplot(aes(x = TotalSeqs)) + 
  geom_histogram(bins = 50) + 
  scale_x_continuous(limits = c(0, 10000)) + 
  labs(title = "Raw DNA Sequencing Depth Distribution") + 
  theme_classic()
## Warning: Removed 43 rows containing non-finite outside the scale range (`stat_bin()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

raw_TotalSeqs_df_RNA %>%
  ggplot(aes(x = TotalSeqs)) + 
  geom_histogram(bins = 50) + 
  scale_x_continuous(limits = c(0, 10000)) + 
  labs(title = "Raw RNA Sequencing Depth Distribution") + 
  theme_classic()
## Warning: Removed 31 rows containing non-finite outside the scale range (`stat_bin()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

#What are the minimum number of sequences for each rooted physeq object?
midroot_physeq %>% sample_sums() %>% min()
## [1] 2142
midroot_physeq_DNA %>% sample_sums() %>% min()
## [1] 2582
midroot_physeq_RNA %>% sample_sums() %>% min()
## [1] 2142
#The minimum number of sequences for DNA samples = 2582 and for RNA samples = 2142
#In order to normailze the read counts we first need to import a scale read function from http://deneflab.github.io/MicrobeMiseq/

matround <- function(x){trunc(x+0.5)}

scale_reads <- function(physeq, n = min(sample_sums(physeq)), round = "round") {
  
  # transform counts to n
  physeq.scale <- transform_sample_counts(physeq, function(x) {(n * x/sum(x))})
  
  # Pick the rounding functions
  if (round == "floor"){
    otu_table(physeq.scale) <- floor(otu_table(physeq.scale))
  } else if (round == "round"){
    otu_table(physeq.scale) <- round(otu_table(physeq.scale))
  } else if (round == "matround"){
    otu_table(physeq.scale) <- matround(otu_table(physeq.scale))
  }
  
  # Prune taxa and return new phyloseq object
  physeq.scale <- prune_taxa(taxa_sums(physeq.scale) > 0, physeq.scale)
  return(physeq.scale)
}
#Scale the reads using the above function, not we are scaling each object by their minimum number for sequences.

# Scale reads combined DNA/RNA by the above function
scaled_rooted_physeq <- 
   midroot_physeq %>%
  scale_reads(round = "matround")

# Scale reads DNA samples by the above function
scaled_rooted_physeq_DNA <- 
  midroot_physeq_DNA %>%
  scale_reads(round = "matround")

# Scale reads RNA samples by the above function
scaled_rooted_physeq_RNA <- 
  midroot_physeq_RNA %>%
  scale_reads(round = "matround")

# Calculate combined DNA//RNA read depth 
scaled_TotalSeqs_df <- 
  scaled_rooted_physeq %>%
  sample_sums() %>%
  data.frame()
colnames(scaled_TotalSeqs_df)[1] <-"TotalSeqs"

head(scaled_TotalSeqs_df) #Inspect to see if the scaling occured
##            TotalSeqs
## 110D2W115D      2143
## 110D2W115R      2130
## 110D2W415D      2137
## 110D2W415R      2140
## 110D2W515D      2146
## 110D2W515R      2147
# Calculate DNA samples read depth 
scaled_TotalSeqs_DNA_df <- 
  scaled_rooted_physeq_DNA %>%
  sample_sums() %>%
  data.frame()
colnames(scaled_TotalSeqs_DNA_df)[1] <-"TotalSeqs"

head(scaled_TotalSeqs_DNA_df) #Inspect to see if the scaling occured
##            TotalSeqs
## 110D2W115D      2581
## 110D2W415D      2583
## 110D2W515D      2584
## 110D2W615D      2586
## 110D2W715D      2583
## 110D2W815D      2568
# Calculate RNA samples read depth 
scaled_TotalSeqs_RNA_df <- 
  scaled_rooted_physeq_RNA %>%
  sample_sums() %>%
  data.frame()
colnames(scaled_TotalSeqs_RNA_df)[1] <-"TotalSeqs"

# Inspect
head(scaled_TotalSeqs_RNA_df)
##            TotalSeqs
## 110D2W115R      2130
## 110D2W415R      2140
## 110D2W515R      2147
## 110D2W615R      2138
## 110D2W715R      2142
## 110D2W815R      2135
#Check the range of the data 
#For Combined,DNA and RNA
min_seqs_combined <- min(scaled_TotalSeqs_df$TotalSeqs);min_seqs_combined
## [1] 2117
min_seqs_DNA <- min(scaled_TotalSeqs_DNA_df$TotalSeqs); min_seqs_DNA
## [1] 2568
min_seqs_RNA <- min(scaled_TotalSeqs_RNA_df$TotalSeqs); min_seqs_RNA
## [1] 2129
max_seqs_combined <- max(scaled_TotalSeqs_df$TotalSeqs);max_seqs_combined
## [1] 2173
max_seqs_DNA <- max(scaled_TotalSeqs_DNA_df$TotalSeqs); max_seqs_DNA
## [1] 2595
max_seqs_RNA <- max(scaled_TotalSeqs_RNA_df$TotalSeqs); max_seqs_RNA
## [1] 2173
#Range 
max_seqs_combined - min_seqs_combined
## [1] 56
max_seqs_DNA - min_seqs_DNA
## [1] 27
max_seqs_RNA - min_seqs_RNA
## [1] 44
# Plot Histogram 
scaled_TotalSeqs_df %>%
  ggplot(aes(x = TotalSeqs)) + 
  geom_histogram(bins = 50) + 
  scale_x_continuous(limits = c(0, 10000)) + 
  labs(title = "Scaled Sequencing Depth at 2142") + 
  theme_classic()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

scaled_TotalSeqs_DNA_df %>%
  ggplot(aes(x = TotalSeqs)) + 
  geom_histogram(bins = 50) + 
  scale_x_continuous(limits = c(0, 10000)) + 
  labs(title = "Scaled Sequencing Depth of DNA only samples at 2582") + 
  theme_classic()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

scaled_TotalSeqs_RNA_df %>%
  ggplot(aes(x = TotalSeqs)) + 
  geom_histogram(bins = 50) + 
  scale_x_continuous(limits = c(0, 10000)) + 
  labs(title = "Scaled Sequencing Depth of RNA only samples at 2142") + 
  theme_classic()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

#We can now start running PCoA with respect to Soreson and Bray-Curtis

# Calculate sorensen dissimilarity: Abundance-unweighted of shared taxa
scaled_soren_pcoa_DNA <-  
  ordinate(
    physeq = scaled_rooted_physeq_DNA,
    method = "PCoA",
    distance = "bray", binary = TRUE)

#str(scaled_soren_pcoa_DNA)

# Plot the ordination 
soren_limnion_pcoa_DNA <- plot_ordination(
  physeq = scaled_rooted_physeq_DNA,
  ordination = scaled_soren_pcoa_DNA,
  color = "lakesite",
  shape = "limnion",
  title = "Sorensen PCoA DNA") +
  geom_point(size=5, alpha = 0.5, aes(color = lakesite)) +
  scale_shape_manual(values = c(15,16,17,18, 19, 20)) +
  scale_color_manual(values = lakesite_colors) + 
  theme_bw()
# Show the plot 
soren_limnion_pcoa_DNA

# Calculate sorensen dissimilarity: Abundance-unweighted of shared taxa
scaled_soren_pcoa_RNA <-  
  ordinate(
    physeq = scaled_rooted_physeq_RNA,
    method = "PCoA",
    distance = "bray", binary = TRUE)

#str(scaled_soren_pcoa_DNA)

# Plot the ordination 
soren_limnion_pcoa_RNA <- plot_ordination(
  physeq = scaled_rooted_physeq_RNA,
  ordination = scaled_soren_pcoa_RNA,
  color = "lakesite",
  shape = "limnion",
  title = "Sorensen PCoA RNA") +
  geom_point(size=5, alpha = 0.5, aes(color = lakesite)) +
  scale_shape_manual(values = c(15,16,17,18, 19, 20)) +
  scale_color_manual(values = lakesite_colors) + 
  theme_bw()
# Show the plot 
soren_limnion_pcoa_RNA

#Running PCoA with season as the shape 
# Plot the ordination 
soren_season_pcoa_DNA <- plot_ordination(
  physeq = scaled_rooted_physeq_DNA,
  ordination = scaled_soren_pcoa_DNA,
  color = "lakesite",
  shape = "season",
  title = "Sorensen PCoA DNA") +
  geom_point(size=5, alpha = 0.5, aes(color = lakesite)) +
  scale_shape_manual(values = c(15,16,17,18, 19, 20)) +
  scale_color_manual(values = lakesite_colors) + 
  theme_bw()
# Show the plot 
soren_season_pcoa_DNA

# Plot the ordination 
soren_season_pcoa_RNA <- plot_ordination(
  physeq = scaled_rooted_physeq_RNA,
  ordination = scaled_soren_pcoa_RNA,
  color = "lakesite",
  shape = "season",
  title = "Sorensen PCoA RNA") +
  geom_point(size=5, alpha = 0.5, aes(color = lakesite)) +
  scale_shape_manual(values = c(15,16,17,18, 19, 20)) +
  scale_color_manual(values = lakesite_colors) + 
  theme_bw()
# Show the plot 
soren_season_pcoa_RNA

#Comeback to combine plots
# Calculate the BC distance
scaled_BC_pcoa_DNA <- 
  ordinate(
    physeq = scaled_rooted_physeq_DNA,
    method = "PCoA",
    distance = "bray")

scaled_BC_pcoa_RNA <- 
  ordinate(
    physeq = scaled_rooted_physeq_RNA,
    method = "PCoA",
    distance = "bray")



# Plot the PCoA
bray_pcoa_limnion_DNA <- 
  plot_ordination(
    physeq = scaled_rooted_physeq_DNA,
    ordination = scaled_BC_pcoa_DNA,
    color = "lakesite",
    shape = "limnion",
    title = "Bray-Curtis PCoA DNA Limnion") +
  geom_point(size=5, alpha = 0.5, aes(color = lakesite)) +
  scale_shape_manual(values = c(15,16,17,18, 19, 20,21)) +
  scale_color_manual(values = c(lakesite_colors)) + 
  theme_bw()

bray_pcoa_limnion_RNA <- 
  plot_ordination(
    physeq = scaled_rooted_physeq_RNA,
    ordination = scaled_BC_pcoa_RNA,
    color = "lakesite",
    shape = "limnion",
    title = "Bray-Curtis PCoA RNA Limnion") +
  geom_point(size=5, alpha = 0.5, aes(color = lakesite)) +
  scale_shape_manual(values = c(15,16,17,18, 19, 20,21)) +
  scale_color_manual(values = c(lakesite_colors)) + 
  theme_bw()

bray_pcoa_season_DNA <- 
  plot_ordination(
    physeq = scaled_rooted_physeq_DNA,
    ordination = scaled_BC_pcoa_DNA,
    color = "lakesite",
    shape= "season",
    title = "Bray-Curtis PCoA DNA Season") +
  geom_point(size=5, alpha = 0.5, aes(color = lakesite)) +
  scale_shape_manual(values = c(15,16,17,18, 19, 20,21)) +
  scale_color_manual(values = c(lakesite_colors)) + 
  theme_bw()

bray_pcoa_season_RNA <- 
  plot_ordination(
    physeq = scaled_rooted_physeq_RNA,
    ordination = scaled_BC_pcoa_RNA,
    color = "lakesite",
    shape= "season",
    title = "Bray-Curtis PCoA RNA Season") +
  geom_point(size=5, alpha = 0.5, aes(color = lakesite)) +
  scale_shape_manual(values = c(15,16,17,18, 19, 20,21)) +
  scale_color_manual(values = c(lakesite_colors)) + 
  theme_bw()




grid.arrange(bray_pcoa_limnion_DNA,
bray_pcoa_limnion_RNA,
bray_pcoa_season_DNA,
bray_pcoa_season_RNA, ncol=4) 

Weighted Unifrac

scaled_wUNI_pcoa_combined <- 
  ordinate(
    physeq = scaled_rooted_physeq,
    method = "PCoA",
    distance = "wunifrac")

scaled_wUNI_pcoa_DNA <- 
  ordinate(
    physeq = scaled_rooted_physeq_DNA,
    method = "PCoA",
    distance = "wunifrac")

scaled_wUNI_pcoa_RNA <- 
  ordinate(
    physeq = scaled_rooted_physeq_RNA,
    method = "PCoA",
    distance = "wunifrac")


# Plot the PCoA
wUNI_station_pcoa_combined_season <- 
  plot_ordination(
    physeq = scaled_rooted_physeq,
    ordination = scaled_wUNI_pcoa_combined,
    color = "lakesite",
    shape = "season",
    title = "Weighted Unifrac PCoA Combined season") +
  geom_point(size=5, alpha = 0.5, aes(color = lakesite)) +
  scale_shape_manual(values = c(15,16,17,18, 19, 20)) +
  scale_color_manual(values = c(lakesite_colors)) + 
  theme_bw()

wUNI_station_pcoa_combined_limnion <- 
  plot_ordination(
    physeq = scaled_rooted_physeq,
    ordination = scaled_wUNI_pcoa_combined,
    color = "lakesite",
    shape = "limnion",
    title = "Weighted Unifrac PCoA Combined limnion") +
  geom_point(size=5, alpha = 0.5, aes(color = lakesite)) +
  scale_shape_manual(values = c(15,16,17,18, 19, 20)) +
  scale_color_manual(values = c(lakesite_colors)) + 
  theme_bw()


wUNI_station_pcoa_DNA_season <- 
  plot_ordination(
    physeq = scaled_rooted_physeq_DNA,
    ordination = scaled_wUNI_pcoa_DNA,
    color = "lakesite",
    shape = "season",
    title = "Weighted Unifrac PCoA DNA season") +
  geom_point(size=5, alpha = 0.5, aes(color = lakesite)) +
  scale_shape_manual(values = c(15,16,17,18, 19, 20)) +
  scale_color_manual(values = c(lakesite_colors)) + 
  theme_bw()

wUNI_station_pcoa_DNA_limnion <- 
  plot_ordination(
    physeq = scaled_rooted_physeq_DNA,
    ordination = scaled_wUNI_pcoa_DNA,
    color = "lakesite",
    shape = "limnion",
    title = "Weighted Unifrac PCoA DNA limnion") +
  geom_point(size=5, alpha = 0.5, aes(color = lakesite)) +
  scale_shape_manual(values = c(15,16,17,18, 19, 20)) +
  scale_color_manual(values = c(lakesite_colors)) + 
  theme_bw()

wUNI_station_pcoa_RNA_season <- 
  plot_ordination(
    physeq = scaled_rooted_physeq_RNA,
    ordination = scaled_wUNI_pcoa_RNA,
    color = "lakesite",
    shape = "season",
    title = "Weighted Unifrac PCoA RNA season") +
  geom_point(size=5, alpha = 0.5, aes(color = lakesite)) +
  scale_shape_manual(values = c(15,16,17,18, 19, 20)) +
  scale_color_manual(values = c(lakesite_colors)) + 
  theme_bw()

wUNI_station_pcoa_RNA_limnion <- 
  plot_ordination(
    physeq = scaled_rooted_physeq_RNA,
    ordination = scaled_wUNI_pcoa_RNA,
    color = "lakesite",
    shape = "limnion",
    title = "Weighted Unifrac PCoA RNA limnion") +
  geom_point(size=5, alpha = 0.5, aes(color = lakesite)) +
  scale_shape_manual(values = c(15,16,17,18, 19, 20)) +
  scale_color_manual(values = c(lakesite_colors)) + 
  theme_bw()

wUNI_station_pcoa_combined_season

wUNI_station_pcoa_combined_limnion

wUNI_station_pcoa_DNA_season

wUNI_station_pcoa_DNA_limnion

wUNI_station_pcoa_RNA_season

wUNI_station_pcoa_RNA_limnion

# Calculate the Weighted Unifrac distance
scaled_wUNI_nmds_combined <- 
  ordinate(
    physeq = scaled_rooted_physeq,
    method = "NMDS",
    distance = "wunifrac")
## Run 0 stress 0.1418455 
## Run 1 stress 0.14491 
## Run 2 stress 0.1615657 
## Run 3 stress 0.1562387 
## Run 4 stress 0.166838 
## Run 5 stress 0.1619733 
## Run 6 stress 0.1494274 
## Run 7 stress 0.1477715 
## Run 8 stress 0.1550695 
## Run 9 stress 0.147955 
## Run 10 stress 0.1542195 
## Run 11 stress 0.1449249 
## Run 12 stress 0.1542195 
## Run 13 stress 0.1668388 
## Run 14 stress 0.1449098 
## Run 15 stress 0.1418455 
## ... New best solution
## ... Procrustes: rmse 1.651949e-05  max resid 0.0001469256 
## ... Similar to previous best
## Run 16 stress 0.1634479 
## Run 17 stress 0.1604705 
## Run 18 stress 0.147952 
## Run 19 stress 0.1600244 
## Run 20 stress 0.1418454 
## ... New best solution
## ... Procrustes: rmse 3.563792e-05  max resid 0.0003311476 
## ... Similar to previous best
## *** Best solution repeated 1 times
scaled_wUNI_nmds_DNA <- 
  ordinate(
    physeq = scaled_rooted_physeq_DNA,
    method = "NMDS",
    distance = "wunifrac")
## Run 0 stress 0.1925474 
## Run 1 stress 0.1985646 
## Run 2 stress 0.2141103 
## Run 3 stress 0.1985623 
## Run 4 stress 0.1925474 
## ... Procrustes: rmse 4.622689e-06  max resid 1.70631e-05 
## ... Similar to previous best
## Run 5 stress 0.2515052 
## Run 6 stress 0.1925474 
## ... Procrustes: rmse 1.281947e-05  max resid 8.138397e-05 
## ... Similar to previous best
## Run 7 stress 0.1926273 
## ... Procrustes: rmse 0.004161513  max resid 0.02407351 
## Run 8 stress 0.1925474 
## ... Procrustes: rmse 3.626435e-05  max resid 0.0002383358 
## ... Similar to previous best
## Run 9 stress 0.2102099 
## Run 10 stress 0.1926274 
## ... Procrustes: rmse 0.004159529  max resid 0.02406611 
## Run 11 stress 0.198565 
## Run 12 stress 0.1985652 
## Run 13 stress 0.2101843 
## Run 14 stress 0.1926273 
## ... Procrustes: rmse 0.004166649  max resid 0.02412437 
## Run 15 stress 0.1925474 
## ... Procrustes: rmse 1.676562e-05  max resid 0.0001045908 
## ... Similar to previous best
## Run 16 stress 0.1985644 
## Run 17 stress 0.1985654 
## Run 18 stress 0.1925474 
## ... Procrustes: rmse 6.392559e-06  max resid 2.872485e-05 
## ... Similar to previous best
## Run 19 stress 0.2451172 
## Run 20 stress 0.1925474 
## ... Procrustes: rmse 7.540351e-06  max resid 4.720985e-05 
## ... Similar to previous best
## *** Best solution repeated 6 times
scaled_wUNI_nmds_RNA <- 
  ordinate(
    physeq = scaled_rooted_physeq_RNA,
    method = "NMDS",
    distance = "wunifrac")
## Run 0 stress 0.1645868 
## Run 1 stress 0.2660364 
## Run 2 stress 0.1645868 
## ... New best solution
## ... Procrustes: rmse 4.067466e-06  max resid 2.252153e-05 
## ... Similar to previous best
## Run 3 stress 0.1645868 
## ... Procrustes: rmse 4.083904e-06  max resid 1.905353e-05 
## ... Similar to previous best
## Run 4 stress 0.1645868 
## ... Procrustes: rmse 2.722264e-06  max resid 1.323778e-05 
## ... Similar to previous best
## Run 5 stress 0.1645868 
## ... Procrustes: rmse 1.98128e-06  max resid 1.034146e-05 
## ... Similar to previous best
## Run 6 stress 0.1645868 
## ... Procrustes: rmse 2.849117e-06  max resid 1.05118e-05 
## ... Similar to previous best
## Run 7 stress 0.1645868 
## ... Procrustes: rmse 5.445529e-06  max resid 3.531163e-05 
## ... Similar to previous best
## Run 8 stress 0.1645868 
## ... Procrustes: rmse 2.938561e-06  max resid 1.758468e-05 
## ... Similar to previous best
## Run 9 stress 0.1645868 
## ... Procrustes: rmse 5.152021e-06  max resid 2.938458e-05 
## ... Similar to previous best
## Run 10 stress 0.1645868 
## ... Procrustes: rmse 3.21662e-06  max resid 1.860854e-05 
## ... Similar to previous best
## Run 11 stress 0.2444954 
## Run 12 stress 0.1645868 
## ... Procrustes: rmse 4.164196e-06  max resid 2.150385e-05 
## ... Similar to previous best
## Run 13 stress 0.1645868 
## ... New best solution
## ... Procrustes: rmse 1.422965e-06  max resid 4.641084e-06 
## ... Similar to previous best
## Run 14 stress 0.1645868 
## ... Procrustes: rmse 3.82778e-06  max resid 1.75597e-05 
## ... Similar to previous best
## Run 15 stress 0.2249074 
## Run 16 stress 0.1645868 
## ... Procrustes: rmse 3.358032e-06  max resid 2.109151e-05 
## ... Similar to previous best
## Run 17 stress 0.1645868 
## ... Procrustes: rmse 5.317784e-06  max resid 2.250939e-05 
## ... Similar to previous best
## Run 18 stress 0.1645868 
## ... Procrustes: rmse 1.990947e-06  max resid 7.963673e-06 
## ... Similar to previous best
## Run 19 stress 0.1645868 
## ... Procrustes: rmse 7.68143e-06  max resid 4.757098e-05 
## ... Similar to previous best
## Run 20 stress 0.1645868 
## ... Procrustes: rmse 2.829496e-06  max resid 1.629448e-05 
## ... Similar to previous best
## *** Best solution repeated 7 times
# Plot the PCoA
wUNI_station_nmds_combined_season <- 
  plot_ordination(
    physeq = scaled_rooted_physeq,
    ordination = scaled_wUNI_nmds_combined,
    color = "lakesite",
    shape = "season",
    title = "Weighted Unifrac NMDS combined season") +
  geom_point(size=5, alpha = 0.5, aes(color = lakesite)) +
  scale_shape_manual(values = c(15,16,17)) +
  scale_color_manual(values = c(lakesite_colors)) + 
  theme_bw()

wUNI_station_nmds_combined_limnion <- 
  plot_ordination(
    physeq = scaled_rooted_physeq,
    ordination = scaled_wUNI_nmds_combined,
    color = "lakesite",
    shape = "limnion",
    title = "Weighted Unifrac NMDS combined limnion") +
  geom_point(size=5, alpha = 0.5, aes(color = lakesite)) +
  scale_shape_manual(values = c(15,16,17)) +
  scale_color_manual(values = c(lakesite_colors)) + 
  theme_bw()

wUNI_station_nmds_DNA_season <- 
  plot_ordination(
    physeq = scaled_rooted_physeq_DNA,
    ordination = scaled_wUNI_nmds_DNA,
    color = "lakesite",
    shape = "season",
    title = "Weighted Unifrac NMDS DNA season") +
  geom_point(size=5, alpha = 0.5, aes(color = lakesite)) +
  scale_shape_manual(values = c(15,16,17)) +
  scale_color_manual(values = c(lakesite_colors)) + 
  theme_bw()

wUNI_station_nmds_DNA_limnion <- 
  plot_ordination(
    physeq = scaled_rooted_physeq_DNA,
    ordination = scaled_wUNI_nmds_DNA,
    color = "lakesite",
    shape = "limnion",
    title = "Weighted Unifrac NMDS DNA limnion") +
  geom_point(size=5, alpha = 0.5, aes(color = lakesite)) +
  scale_shape_manual(values = c(15,16,17)) +
  scale_color_manual(values = c(lakesite_colors)) + 
  theme_bw()

wUNI_station_nmds_RNA_season <- 
  plot_ordination(
    physeq = scaled_rooted_physeq_RNA,
    ordination = scaled_wUNI_nmds_RNA,
    color = "lakesite",
    shape = "season",
    title = "Weighted Unifrac NMDS RNA season") +
  geom_point(size=5, alpha = 0.5, aes(color = lakesite)) +
  scale_shape_manual(values = c(15,16,17)) +
  scale_color_manual(values = c(lakesite_colors)) + 
  theme_bw()

wUNI_station_nmds_RNA_limnion <- 
  plot_ordination(
    physeq = scaled_rooted_physeq_RNA,
    ordination = scaled_wUNI_nmds_RNA,
    color = "lakesite",
    shape = "limnion",
    title = "Weighted Unifrac NMDS RNA limnion") +
  geom_point(size=5, alpha = 0.5, aes(color = lakesite)) +
  scale_shape_manual(values = c(15,16,17)) +
  scale_color_manual(values = c(lakesite_colors)) + 
  theme_bw()


wUNI_station_nmds_combined_season

wUNI_station_nmds_combined_limnion

wUNI_station_nmds_DNA_season

wUNI_station_nmds_DNA_limnion

wUNI_station_nmds_RNA_season

wUNI_station_nmds_RNA_limnion

##Statistical Significance Testing

# Calculate all three of the distance matrices

scaled_sorensen_dist_combined <- phyloseq::distance(scaled_rooted_physeq, method = "bray", binary = TRUE)

scaled_sorensen_dist_DNA <- phyloseq::distance(scaled_rooted_physeq_DNA, method = "bray", binary = TRUE)

scaled_sorensen_dist_RNA <- phyloseq::distance(scaled_rooted_physeq_RNA, method = "bray", binary = TRUE)

scaled_bray_dist_combined <- phyloseq::distance(scaled_rooted_physeq, method = "bray")

scaled_bray_dist_DNA <- phyloseq::distance(scaled_rooted_physeq_DNA, method = "bray")

scaled_bray_dist_RNA <- phyloseq::distance(scaled_rooted_physeq_RNA, method = "bray")

scaled_wUnifrac_dist_combined <- phyloseq::distance(scaled_rooted_physeq, method = "wunifrac")

scaled_wUnifrac_dist_DNA <- phyloseq::distance(scaled_rooted_physeq_DNA, method = "wunifrac")

scaled_wUnifrac_dist_RNA <- phyloseq::distance(scaled_rooted_physeq_RNA, method = "wunifrac")

# make a data frame from the sample_data
# All distance matrices will be the same metadata because they 
# originate from the same phyloseq object. 
metadata_combined <- data.frame(sample_data(scaled_rooted_physeq))
metadata_DNA <- data.frame(sample_data(scaled_rooted_physeq_DNA))
metadata_RNA <- data.frame(sample_data(scaled_rooted_physeq_RNA))

# Adonis test
# In this example we are testing the hypothesis that the five stations
# that were collected have different centroids in the ordination space 
# for each of the dissimilarity metrics, we are using a discrete variable 
adonis2(scaled_sorensen_dist_combined ~ lakesite, data = metadata_combined)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = scaled_sorensen_dist_combined ~ lakesite, data = metadata_combined)
##           Df SumOfSqs      R2      F Pr(>F)    
## lakesite   3   5.8697 0.23886 10.983  0.001 ***
## Residual 105  18.7043 0.76114                  
## Total    108  24.5740 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
adonis2(scaled_bray_dist_combined ~ lakesite, data = metadata_combined)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = scaled_bray_dist_combined ~ lakesite, data = metadata_combined)
##           Df SumOfSqs      R2      F Pr(>F)    
## lakesite   3   4.4486 0.16638 6.9855  0.001 ***
## Residual 105  22.2892 0.83362                  
## Total    108  26.7379 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
adonis2(scaled_wUnifrac_dist_combined ~ lakesite, data = metadata_combined)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = scaled_wUnifrac_dist_combined ~ lakesite, data = metadata_combined)
##           Df SumOfSqs      R2      F Pr(>F)   
## lakesite   3   0.5782 0.08178 3.1171  0.002 **
## Residual 105   6.4923 0.91822                 
## Total    108   7.0705 1.00000                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
adonis2(scaled_sorensen_dist_DNA ~ lakesite, data = metadata_DNA)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = scaled_sorensen_dist_DNA ~ lakesite, data = metadata_DNA)
##          Df SumOfSqs      R2      F Pr(>F)    
## lakesite  3   3.2802 0.32063 8.1804  0.001 ***
## Residual 52   6.9503 0.67937                  
## Total    55  10.2305 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
adonis2(scaled_bray_dist_DNA ~ lakesite, data = metadata_DNA)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = scaled_bray_dist_DNA ~ lakesite, data = metadata_DNA)
##          Df SumOfSqs      R2      F Pr(>F)    
## lakesite  3   2.5944 0.30663 7.6653  0.001 ***
## Residual 52   5.8667 0.69337                  
## Total    55   8.4611 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
adonis2(scaled_wUnifrac_dist_DNA ~ lakesite, data = metadata_DNA)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = scaled_wUnifrac_dist_DNA ~ lakesite, data = metadata_DNA)
##          Df SumOfSqs      R2      F Pr(>F)    
## lakesite  3  0.28642 0.17106 3.5768  0.001 ***
## Residual 52  1.38801 0.82894                  
## Total    55  1.67444 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
adonis2(scaled_sorensen_dist_RNA ~ lakesite, data = metadata_RNA)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = scaled_sorensen_dist_RNA ~ lakesite, data = metadata_RNA)
##          Df SumOfSqs      R2      F Pr(>F)    
## lakesite  3   3.1514 0.25969 5.7294  0.001 ***
## Residual 49   8.9840 0.74031                  
## Total    52  12.1354 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
adonis2(scaled_bray_dist_RNA ~ lakesite, data = metadata_RNA)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = scaled_bray_dist_RNA ~ lakesite, data = metadata_RNA)
##          Df SumOfSqs      R2      F Pr(>F)    
## lakesite  3   2.7364 0.23186 4.9302  0.001 ***
## Residual 49   9.0656 0.76814                  
## Total    52  11.8020 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
adonis2(scaled_wUnifrac_dist_RNA ~ lakesite, data = metadata_RNA)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = scaled_wUnifrac_dist_RNA ~ lakesite, data = metadata_RNA)
##          Df SumOfSqs      R2      F Pr(>F)    
## lakesite  3    0.420 0.18049 3.5973  0.001 ***
## Residual 49    1.907 0.81951                  
## Total    52    2.327 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

##BetaDispR

# Homogeneity of Disperson test with beta dispr
# Sorensen 
beta_soren_station_combined <- betadisper(scaled_sorensen_dist_combined, metadata_combined$lakesite)
permutest(beta_soren_station_combined)
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 999
## 
## Response: Distances
##            Df  Sum Sq  Mean Sq      F N.Perm Pr(>F)   
## Groups      3 0.14349 0.047831 5.6975    999  0.004 **
## Residuals 105 0.88148 0.008395                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Bray-curtis 
beta_bray_station_combined <- betadisper(scaled_bray_dist_combined, metadata_combined$lakesite)
permutest(beta_bray_station_combined)
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 999
## 
## Response: Distances
##            Df  Sum Sq  Mean Sq      F N.Perm Pr(>F)
## Groups      3 0.04498 0.014993 1.7103    999  0.153
## Residuals 105 0.92043 0.008766
# Weighted Unifrac 
beta_unifrac_station_combined <- betadisper(scaled_wUnifrac_dist_combined, metadata_combined$lakesite)
permutest(beta_unifrac_station_combined)
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 999
## 
## Response: Distances
##            Df   Sum Sq    Mean Sq      F N.Perm Pr(>F)
## Groups      3 0.001152 0.00038413 0.1343    999  0.928
## Residuals 105 0.300301 0.00286001
#Homogeneity of Dispersion test with beta dispr 
#Soerson DNA
beta_soren_station_DNA <- betadisper(scaled_sorensen_dist_DNA, metadata_DNA$lakesite)
permutest(beta_soren_station_DNA)
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 999
## 
## Response: Distances
##           Df  Sum Sq   Mean Sq      F N.Perm Pr(>F)  
## Groups     3 0.06805 0.0226834 2.5481    999  0.056 .
## Residuals 52 0.46291 0.0089021                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Bray-curtis DNA
beta_bray_station_DNA <- betadisper(scaled_bray_dist_DNA, metadata_DNA$lakesite)
permutest(beta_bray_station_DNA)
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 999
## 
## Response: Distances
##           Df  Sum Sq   Mean Sq      F N.Perm Pr(>F)
## Groups     3 0.03891 0.0129706 1.6156    999  0.217
## Residuals 52 0.41748 0.0080285
# Weighted Unifrac DNA
beta_bray_unifrac_DNA <- betadisper(scaled_wUnifrac_dist_DNA, metadata_DNA$lakesite)
permutest(beta_bray_unifrac_DNA)
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 999
## 
## Response: Distances
##           Df   Sum Sq   Mean Sq      F N.Perm Pr(>F)
## Groups     3 0.007897 0.0026323 1.1012    999  0.364
## Residuals 52 0.124307 0.0023905
#Homogeneity of Dispersion test with beta dispr 
#Soerson RNA
beta_soren_station_RNA <- betadisper(scaled_sorensen_dist_RNA, metadata_RNA$lakesite)
permutest(beta_soren_station_RNA)
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 999
## 
## Response: Distances
##           Df  Sum Sq  Mean Sq      F N.Perm Pr(>F)  
## Groups     3 0.09634 0.032114 3.6571    999   0.02 *
## Residuals 49 0.43028 0.008781                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Bray-curtis RNA
beta_bray_station_RNA <- betadisper(scaled_bray_dist_RNA, metadata_RNA$lakesite)
permutest(beta_bray_station_RNA)
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 999
## 
## Response: Distances
##           Df  Sum Sq  Mean Sq      F N.Perm Pr(>F)  
## Groups     3 0.07377 0.024588 2.3909    999  0.096 .
## Residuals 49 0.50393 0.010284                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Weighted Unifrac rNA
beta_bray_unifrac_RNA <- betadisper(scaled_wUnifrac_dist_RNA, metadata_RNA$lakesite)
permutest(beta_bray_unifrac_RNA)
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 999
## 
## Response: Distances
##           Df   Sum Sq    Mean Sq      F N.Perm Pr(>F)
## Groups     3 0.002733 0.00091112 0.2955    999  0.835
## Residuals 49 0.151078 0.00308323

Taxonomic Composition

#Phylum

# Set the phylum colors
phylum_colors <- c(
  Acidobacteriota = "navy", 
  Actinobacteriota = "darkslategray2", 
  Armatimonadota = "deeppink1",
  Alphaproteobacteria = "plum2", 
  Bacteroidota = "gold", 
  Betaproteobacteria = "plum1", 
  Bdellovibrionota = "red1",
  Chloroflexi="black", 
  Crenarchaeota = "firebrick",
  Cyanobacteria = "limegreen",
  Deltaproteobacteria = "grey", 
  Desulfobacterota="magenta",
  Firmicutes = "#3E9B96",
  Gammaproteobacteria = "greenyellow",
  "Marinimicrobia (SAR406 clade)" = "yellow",
  Myxococcota = "#B5D6AA",
  Nitrospirota = "palevioletred1",
  Proteobacteria = "royalblue",
  Planctomycetota = "darkorange", 
  "SAR324 clade(Marine group B)" = "olivedrab",
  #Proteobacteria_unclassified = "greenyellow",
  Thermoplasmatota = "green",
  Verrucomicrobiota = "darkorchid1")
 # Other = "grey")
# Calculate the phylum relative abundance 
# Make different dfs for combined, DNA and RNA samples
phylum_df_combined <- 
  scaled_rooted_physeq %>%
  # agglomerate at the phylum level 
  tax_glom(taxrank = "Phylum") %>% 
  # Transform counts to relative abundance 
  transform_sample_counts(function (x) {x/sum(x)}) %>%
  # Melt to a long format 
  psmelt() %>%
  # Filter out Phyla that are <1% - get rid of low abundant Phyla
  dplyr::filter(Abundance > 0.01)

phylum_df_DNA <- 
  scaled_rooted_physeq_DNA %>%
  # agglomerate at the phylum level 
  tax_glom(taxrank = "Phylum") %>% 
  # Transform counts to relative abundance 
  transform_sample_counts(function (x) {x/sum(x)}) %>%
  # Melt to a long format 
  psmelt() %>%
  # Filter out Phyla that are <1% - get rid of low abundant Phyla
  dplyr::filter(Abundance > 0.01)

phylum_df_RNA <- 
  scaled_rooted_physeq_RNA %>%
  # agglomerate at the phylum level 
  tax_glom(taxrank = "Phylum") %>% 
  # Transform counts to relative abundance 
  transform_sample_counts(function (x) {x/sum(x)}) %>%
  # Melt to a long format 
  psmelt() %>%
  # Filter out Phyla that are <1% - get rid of low abundant Phyla
  dplyr::filter(Abundance > 0.01)

#Combined Phyla Plots

# Stacked Bar Plot With All phyla 
# Plot Phylum Abundances - make sure to load phylum_colors 
phylum_df_combined %>%
  # Warning: It's important to have one sample per x value, 
  # otherwise, it will take the sum between multiple samples
  dplyr::filter(limnion == "Bottom") %>% 
  ggplot(aes(x = lakesite, y = Abundance, fill = Phylum)) + 
  facet_grid(Phylum~season, scale = "free") +
  geom_bar(stat = "identity", color = "black") + 
  labs(title = "Bottom Limnion Phylum Composition") + 
  scale_fill_manual(values = phylum_colors) + 
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1))

phylum_df_combined %>%
  # Warning: It's important to have one sample per x value, 
  # otherwise, it will take the sum between multiple samples
  dplyr::filter(limnion == "Middle") %>% 
  ggplot(aes(x = lakesite, y = Abundance, fill = Phylum)) + 
  facet_grid(Phylum~season, scale = "free") +
  geom_bar(stat = "identity", color = "black") + 
  labs(title = "Middle Limnion Phylum Composition") + 
  scale_fill_manual(values = phylum_colors) + 
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1))

phylum_df_combined %>%
  # Warning: It's important to have one sample per x value, 
  # otherwise, it will take the sum between multiple samples
  dplyr::filter(limnion == "Top") %>% 
  ggplot(aes(x = lakesite, y = Abundance, fill = Phylum)) + 
  facet_grid(Phylum~season, scale = "free") +
  geom_bar(stat = "identity", color = "black") + 
  labs(title = "Top Limnion Phylum Composition") + 
  scale_fill_manual(values = phylum_colors) + 
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1))

#Phyla plots from DNA samples

phylum_df_DNA %>%
  # Warning: It's important to have one sample per x value, 
  # otherwise, it will take the sum between multiple samples
  dplyr::filter(limnion == "Top") %>% 
  ggplot(aes(x = lakesite, y = Abundance, fill = Phylum)) + 
  facet_grid(Phylum~season, scale = "free") +
  geom_bar(stat = "identity", color = "black") + 
  labs(title = "Top Limnion Phylum Composition DNA") + 
  scale_fill_manual(values = phylum_colors) + 
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1))

phylum_df_DNA %>%
  # Warning: It's important to have one sample per x value, 
  # otherwise, it will take the sum between multiple samples
  dplyr::filter(limnion == "Middle") %>% 
  ggplot(aes(x = lakesite, y = Abundance, fill = Phylum)) + 
  facet_grid(Phylum~season, scale = "free") +
  geom_bar(stat = "identity", color = "black") + 
  labs(title = "Middle Limnion Phylum Composition DNA") + 
  scale_fill_manual(values = phylum_colors) + 
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1))

phylum_df_DNA %>%
  # Warning: It's important to have one sample per x value, 
  # otherwise, it will take the sum between multiple samples
  dplyr::filter(limnion == "Bottom") %>% 
  ggplot(aes(x = lakesite, y = Abundance, fill = Phylum)) + 
  facet_grid(Phylum~season, scale = "free") +
  geom_bar(stat = "identity", color = "black") + 
  labs(title = "Bottom Limnion Phylum Composition DNA") + 
  scale_fill_manual(values = phylum_colors) + 
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1))

#Phyla from RNA samples

phylum_df_RNA <- 
  scaled_rooted_physeq_RNA %>%
  # agglomerate at the phylum level 
  tax_glom(taxrank = "Phylum") %>% 
  # Transform counts to relative abundance 
  transform_sample_counts(function (x) {x/sum(x)}) %>%
  # Melt to a long format 
  psmelt() %>%
  # Filter out Phyla that are <1% - get rid of low abundant Phyla
  dplyr::filter(Abundance > 0.01)

phylum_df_RNA %>%
  # Warning: It's important to have one sample per x value, 
  # otherwise, it will take the sum between multiple samples
  dplyr::filter(limnion == "Middle") %>% 
  ggplot(aes(x = lakesite, y = Abundance, fill = Phylum)) + 
  facet_grid(Phylum~season, scale = "free") +
  geom_bar(stat = "identity", color = "black") + 
  labs(title = "Middle Limnion Phylum Composition RNA") + 
  scale_fill_manual(values = phylum_colors) + 
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1))

phylum_df_RNA %>%
  # Warning: It's important to have one sample per x value, 
  # otherwise, it will take the sum between multiple samples
  dplyr::filter(limnion == "Bottom") %>% 
  ggplot(aes(x = lakesite, y = Abundance, fill = Phylum)) + 
  facet_grid(Phylum~season, scale = "free") +
  geom_bar(stat = "identity", color = "black") + 
  labs(title = "Bottom Limnion Phylum Composition RNA") + 
  scale_fill_manual(values = phylum_colors) + 
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1))

phylum_df_RNA %>%
  # Warning: It's important to have one sample per x value, 
  # otherwise, it will take the sum between multiple samples
  dplyr::filter(limnion == "Top") %>% 
  ggplot(aes(x = lakesite, y = Abundance, fill = Phylum)) + 
  facet_grid(Phylum~season, scale = "free") +
  geom_bar(stat = "identity", color = "black") + 
  labs(title = "Top Limnion Phylum Composition RNA") + 
  scale_fill_manual(values = phylum_colors) + 
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1))

#Narrowing in on specific phyla

# Narrow in on a specific group
# Actinobacteriota - y: abundance, x: station, dot plot + boxplot

##Family

# Calculate the Family relative abundance for each sample
# # Make separate dfs for combined, DNA and RNA samples
family_df_combined <- 
  scaled_rooted_physeq %>%
  # agglomerate at the phylum level 
  tax_glom(taxrank = "Family") %>% 
  # Transform counts to relative abundance 
  transform_sample_counts(function (x) {x/sum(x)}) %>%
  # Melt to a long format 
  psmelt() %>%
  # Filter out Phyla that are <1% - get rid of low abundant Phyla
  dplyr::filter(Abundance > 0.01) 

family_df_DNA <- 
  scaled_rooted_physeq_DNA %>%
  # agglomerate at the phylum level 
  tax_glom(taxrank = "Family") %>% 
  # Transform counts to relative abundance 
  transform_sample_counts(function (x) {x/sum(x)}) %>%
  # Melt to a long format 
  psmelt() %>%
  # Filter out Phyla that are <1% - get rid of low abundant Phyla
  dplyr::filter(Abundance > 0.01) 

family_df_RNA <- 
  scaled_rooted_physeq_RNA %>%
  # agglomerate at the phylum level 
  tax_glom(taxrank = "Family") %>% 
  # Transform counts to relative abundance 
  transform_sample_counts(function (x) {x/sum(x)}) %>%
  # Melt to a long format 
  psmelt() %>%
  # Filter out Phyla that are <1% - get rid of low abundant Phyla
  dplyr::filter(Abundance > 0.01) 
#Judging from the Phylum plots we identified Cyanobacteria,Actinobacteriota and Verrucomicrobiota as the most interesting Phyla to discet further as they seem to be quite abudant and vary between season and limnion.
# Cyanobacteria Families in DNA sample 
# Plot family 
cyano_season_DNA <- family_df_DNA %>%
  dplyr::filter(Phylum == "Cyanobacteria") %>%
  # build the plot 
  ggplot(aes(x = lakesite, y = Abundance, 
             fill = lakesite, color = lakesite)) + 
  facet_grid(Family~season, scale = "free") +
  geom_boxplot(alpha = 0.5, outlier.shape = NA) + # outliers not plotted here in boxplot 
  geom_jitter() + 
  theme_bw() + 
  labs(title = "Cyanobacteria Family Abundance") + 
  scale_color_manual(values = lakesite_colors) + 
  scale_fill_manual(values = lakesite_colors) + 
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
        legend.position = "bottom")

actin_season_dna <- family_df_DNA %>%
  dplyr::filter(Phylum == "Actinobacteriota") %>%
  # build the plot 
  ggplot(aes(x = lakesite, y = Abundance, 
             fill = lakesite, color = lakesite)) + 
  facet_grid(Family~season, scale = "free") +
  geom_boxplot(alpha = 0.5, outlier.shape = NA) + # outliers not plotted here in boxplot 
  geom_jitter() + 
  theme_bw() + 
  labs(title = "Actinobacteriota Family Abundance") + 
  scale_color_manual(values = lakesite_colors) + 
  scale_fill_manual(values = lakesite_colors) + 
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
        legend.position = "bottom")
#actin_dna sanity check

verru_season_dna <- family_df_DNA %>%
  dplyr::filter(Phylum == "Verrucomicrobiota") %>%
  # build the plot 
  ggplot(aes(x = lakesite, y = Abundance, 
             fill = lakesite, color = lakesite)) + 
  facet_grid(Family~season, scale = "free") +
  geom_boxplot(alpha = 0.5, outlier.shape = NA) + # outliers not plotted here in boxplot 
  geom_jitter() + 
  theme_bw() + 
  labs(title = " VerrucomicrobiotaFamily Abundance") + 
  scale_color_manual(values = lakesite_colors) + 
  scale_fill_manual(values = lakesite_colors) + 
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
        legend.position = "bottom")
#verru_dna sanity check

#Now we will check the DNA families faceting by limnion
# Cyanobacteria Families in DNA sample 
# Plot family 
cyano_lim_DNA <- family_df_DNA %>%
  dplyr::filter(Phylum == "Cyanobacteria") %>%
  # build the plot 
  ggplot(aes(x = lakesite, y = Abundance, 
             fill = lakesite, color = lakesite)) + 
  facet_grid(Family~limnion, scale = "free") +
  geom_boxplot(alpha = 0.5, outlier.shape = NA) + # outliers not plotted here in boxplot 
  geom_jitter() + 
  theme_bw() + 
  labs(title = "Cyanobacteria Family Abundance") + 
  scale_color_manual(values = lakesite_colors) + 
  scale_fill_manual(values = lakesite_colors) + 
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
        legend.position = "bottom")

actin_lim_dna <- family_df_DNA %>%
  dplyr::filter(Phylum == "Actinobacteriota") %>%
  # build the plot 
  ggplot(aes(x = lakesite, y = Abundance, 
             fill = lakesite, color = lakesite)) + 
  facet_grid(Family~limnion, scale = "free") +
  geom_boxplot(alpha = 0.5, outlier.shape = NA) + # outliers not plotted here in boxplot 
  geom_jitter() + 
  theme_bw() + 
  labs(title = "Actinobacteriota Family Abundance") + 
  scale_color_manual(values = lakesite_colors) + 
  scale_fill_manual(values = lakesite_colors) + 
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
        legend.position = "bottom")
#actin_dna sanity check

verru_limnion_dna <- family_df_DNA %>%
  dplyr::filter(Phylum == "Verrucomicrobiota") %>%
  # build the plot 
  ggplot(aes(x = lakesite, y = Abundance, 
             fill = lakesite, color = lakesite)) + 
  facet_grid(Family~limnion, scale = "free") +
  geom_boxplot(alpha = 0.5, outlier.shape = NA) + # outliers not plotted here in boxplot 
  geom_jitter() + 
  theme_bw() + 
  labs(title = " VerrucomicrobiotaFamily Abundance") + 
  scale_color_manual(values = lakesite_colors) + 
  scale_fill_manual(values = lakesite_colors) + 
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
        legend.position = "bottom")
#verru_dna sanity check

#RNA Samples Plot

cyano_season_RNA <- family_df_RNA %>%
  dplyr::filter(Phylum == "Cyanobacteria") %>%
  # build the plot 
  ggplot(aes(x = lakesite, y = Abundance, 
             fill = lakesite, color = lakesite)) + 
  facet_grid(Family~season, scale = "free") +
  geom_boxplot(alpha = 0.5, outlier.shape = NA) + # outliers not plotted here in boxplot 
  geom_jitter() + 
  theme_bw() + 
  labs(title = "Cyanobacteria RNA Family Abundance") + 
  scale_color_manual(values = lakesite_colors) + 
  scale_fill_manual(values = lakesite_colors) + 
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
        legend.position = "bottom")

actin_season_rna <- family_df_RNA %>%
  dplyr::filter(Phylum == "Actinobacteriota") %>%
  # build the plot 
  ggplot(aes(x = lakesite, y = Abundance, 
             fill = lakesite, color = lakesite)) + 
  facet_grid(Family~season, scale = "free") +
  geom_boxplot(alpha = 0.5, outlier.shape = NA) + # outliers not plotted here in boxplot 
  geom_jitter() + 
  theme_bw() + 
  labs(title = "Actinobacteriota Family RNA Abundance") + 
  scale_color_manual(values = lakesite_colors) + 
  scale_fill_manual(values = lakesite_colors) + 
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
        legend.position = "bottom")
#actin_dna sanity check

verru_season_rna <- family_df_RNA %>%
  dplyr::filter(Phylum == "Verrucomicrobiota") %>%
  # build the plot 
  ggplot(aes(x = lakesite, y = Abundance, 
             fill = lakesite, color = lakesite)) + 
  facet_grid(Family~season, scale = "free") +
  geom_boxplot(alpha = 0.5, outlier.shape = NA) + # outliers not plotted here in boxplot 
  geom_jitter() + 
  theme_bw() + 
  labs(title = " Verrucomicrobiota Family RNA Abundance") + 
  scale_color_manual(values = lakesite_colors) + 
  scale_fill_manual(values = lakesite_colors) + 
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
        legend.position = "bottom")
#RNA Limnion Samples
cyano_lim_RNA <- family_df_RNA %>%
  dplyr::filter(Phylum == "Cyanobacteria") %>%
  # build the plot 
  ggplot(aes(x = lakesite, y = Abundance, 
             fill = lakesite, color = lakesite)) + 
  facet_grid(Family~limnion, scale = "free") +
  geom_boxplot(alpha = 0.5, outlier.shape = NA) + # outliers not plotted here in boxplot 
  geom_jitter() + 
  theme_bw() + 
  labs(title = "Cyanobacteria RNA Family Abundance") + 
  scale_color_manual(values = lakesite_colors) + 
  scale_fill_manual(values = lakesite_colors) + 
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
        legend.position = "bottom")

actin_lim_rna <- family_df_RNA %>%
  dplyr::filter(Phylum == "Actinobacteriota") %>%
  # build the plot 
  ggplot(aes(x = lakesite, y = Abundance, 
             fill = lakesite, color = lakesite)) + 
  facet_grid(Family~limnion, scale = "free") +
  geom_boxplot(alpha = 0.5, outlier.shape = NA) + # outliers not plotted here in boxplot 
  geom_jitter() + 
  theme_bw() + 
  labs(title = "Actinobacteriota Family RNA Abundance") + 
  scale_color_manual(values = lakesite_colors) + 
  scale_fill_manual(values = lakesite_colors) + 
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
        legend.position = "bottom")
#actin_dna sanity check

verru_lim_rna <- family_df_RNA %>%
  dplyr::filter(Phylum == "Verrucomicrobiota") %>%
  # build the plot 
  ggplot(aes(x = lakesite, y = Abundance, 
             fill = lakesite, color = lakesite)) + 
  facet_grid(Family~limnion, scale = "free") +
  geom_boxplot(alpha = 0.5, outlier.shape = NA) + # outliers not plotted here in boxplot 
  geom_jitter() + 
  theme_bw() + 
  labs(title = " Verrucomicrobiota Family RNA Abundance") + 
  scale_color_manual(values = lakesite_colors) + 
  scale_fill_manual(values = lakesite_colors) + 
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
        legend.position = "bottom")

verru_lim_rna

actin_lim_rna

##Genus

# Calculate the genus relative abundance for each sample
# Make separate dfs for combined, DNA and RNA samples
genus_df <- 
  scaled_rooted_physeq %>%
  # agglomerate at the phylum level 
  tax_glom(taxrank = "Genus") %>% 
  # Transform counts to relative abundance 
  transform_sample_counts(function (x) {x/sum(x)}) %>%
  # Melt to a long format 
  psmelt() %>%
  # Filter out Phyla that are <1% - get rid of low abundant Phyla
  dplyr::filter(Abundance > 0.01) 
 

genus_df_DNA <- 
  scaled_rooted_physeq_DNA %>%
  # agglomerate at the phylum level 
  tax_glom(taxrank = "Genus") %>% 
  # Transform counts to relative abundance 
  transform_sample_counts(function (x) {x/sum(x)}) %>%
  # Melt to a long format 
  psmelt() %>%
  # Filter out Phyla that are <1% - get rid of low abundant Phyla
  dplyr::filter(Abundance > 0.01) 
  
genus_df_RNA <- 
  scaled_rooted_physeq_RNA %>%
  # agglomerate at the phylum level 
  tax_glom(taxrank = "Genus") %>% 
  # Transform counts to relative abundance 
  transform_sample_counts(function (x) {x/sum(x)}) %>%
  # Melt to a long format 
  psmelt() %>%
  # Filter out Phyla that are <1% - get rid of low abundant Phyla
  dplyr::filter(Abundance > 0.01) 
# Actinobacteriota
# Plot genus 


# Cyanobacteria 
# Plot genus 

##Session Information

devtools::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
##  setting  value
##  version  R version 4.3.2 (2023-10-31)
##  os       Rocky Linux 9.0 (Blue Onyx)
##  system   x86_64, linux-gnu
##  ui       X11
##  language (EN)
##  collate  en_US.UTF-8
##  ctype    en_US.UTF-8
##  tz       America/New_York
##  date     2024-04-30
##  pandoc   3.1.1 @ /usr/lib/rstudio-server/bin/quarto/bin/tools/ (via rmarkdown)
## 
## ─ Packages ───────────────────────────────────────────────────────────────────
##  package          * version    date (UTC) lib source
##  ade4               1.7-22     2023-02-06 [1] CRAN (R 4.3.2)
##  ape                5.7-1      2023-03-13 [2] CRAN (R 4.3.2)
##  Biobase            2.62.0     2023-10-24 [2] Bioconductor
##  BiocGenerics       0.48.1     2023-11-01 [2] Bioconductor
##  biomformat         1.30.0     2023-10-24 [1] Bioconductor
##  Biostrings         2.70.1     2023-10-25 [2] Bioconductor
##  bitops             1.0-7      2021-04-24 [2] CRAN (R 4.3.2)
##  bslib              0.5.1      2023-08-11 [2] CRAN (R 4.3.2)
##  cachem             1.0.8      2023-05-01 [2] CRAN (R 4.3.2)
##  callr              3.7.3      2022-11-02 [2] CRAN (R 4.3.2)
##  cli                3.6.1      2023-03-23 [2] CRAN (R 4.3.2)
##  cluster            2.1.4      2022-08-22 [2] CRAN (R 4.3.2)
##  codetools          0.2-19     2023-02-01 [2] CRAN (R 4.3.2)
##  colorspace         2.1-0      2023-01-23 [2] CRAN (R 4.3.2)
##  crayon             1.5.2      2022-09-29 [2] CRAN (R 4.3.2)
##  data.table         1.14.8     2023-02-17 [2] CRAN (R 4.3.2)
##  devtools         * 2.4.4      2022-07-20 [2] CRAN (R 4.2.1)
##  digest             0.6.33     2023-07-07 [2] CRAN (R 4.3.2)
##  dplyr            * 1.1.3      2023-09-03 [2] CRAN (R 4.3.2)
##  ellipsis           0.3.2      2021-04-29 [2] CRAN (R 4.3.2)
##  evaluate           0.23       2023-11-01 [2] CRAN (R 4.3.2)
##  fansi              1.0.5      2023-10-08 [2] CRAN (R 4.3.2)
##  farver             2.1.1      2022-07-06 [2] CRAN (R 4.3.2)
##  fastmap            1.1.1      2023-02-24 [2] CRAN (R 4.3.2)
##  forcats          * 1.0.0      2023-01-29 [1] CRAN (R 4.3.2)
##  foreach            1.5.2      2022-02-02 [2] CRAN (R 4.3.2)
##  fs                 1.6.3      2023-07-20 [2] CRAN (R 4.3.2)
##  generics           0.1.3      2022-07-05 [2] CRAN (R 4.3.2)
##  GenomeInfoDb       1.38.0     2023-10-24 [2] Bioconductor
##  GenomeInfoDbData   1.2.11     2023-11-07 [2] Bioconductor
##  ggplot2          * 3.5.0      2024-02-23 [2] CRAN (R 4.3.2)
##  glue               1.6.2      2022-02-24 [2] CRAN (R 4.3.2)
##  gridExtra        * 2.3        2017-09-09 [1] CRAN (R 4.3.2)
##  gtable             0.3.4      2023-08-21 [2] CRAN (R 4.3.2)
##  highr              0.10       2022-12-22 [2] CRAN (R 4.3.2)
##  hms                1.1.3      2023-03-21 [1] CRAN (R 4.3.2)
##  htmltools          0.5.7      2023-11-03 [2] CRAN (R 4.3.2)
##  htmlwidgets        1.6.2      2023-03-17 [2] CRAN (R 4.3.2)
##  httpuv             1.6.12     2023-10-23 [2] CRAN (R 4.3.2)
##  igraph             1.5.1      2023-08-10 [2] CRAN (R 4.3.2)
##  IRanges            2.36.0     2023-10-24 [2] Bioconductor
##  iterators          1.0.14     2022-02-05 [2] CRAN (R 4.3.2)
##  jquerylib          0.1.4      2021-04-26 [2] CRAN (R 4.3.2)
##  jsonlite           1.8.7      2023-06-29 [2] CRAN (R 4.3.2)
##  knitr              1.45       2023-10-30 [2] CRAN (R 4.3.2)
##  labeling           0.4.3      2023-08-29 [2] CRAN (R 4.3.2)
##  later              1.3.1      2023-05-02 [2] CRAN (R 4.3.2)
##  lattice          * 0.21-9     2023-10-01 [2] CRAN (R 4.3.2)
##  lifecycle          1.0.3      2022-10-07 [2] CRAN (R 4.3.2)
##  lubridate        * 1.9.3      2023-09-27 [1] CRAN (R 4.3.2)
##  magrittr           2.0.3      2022-03-30 [2] CRAN (R 4.3.2)
##  MASS               7.3-60     2023-05-04 [2] CRAN (R 4.3.2)
##  Matrix             1.6-1.1    2023-09-18 [2] CRAN (R 4.3.2)
##  memoise            2.0.1      2021-11-26 [2] CRAN (R 4.3.2)
##  mgcv               1.9-0      2023-07-11 [2] CRAN (R 4.3.2)
##  mime               0.12       2021-09-28 [2] CRAN (R 4.3.2)
##  miniUI             0.1.1.1    2018-05-18 [2] CRAN (R 4.3.2)
##  multtest           2.58.0     2023-10-24 [1] Bioconductor
##  munsell            0.5.0      2018-06-12 [2] CRAN (R 4.3.2)
##  nlme               3.1-163    2023-08-09 [2] CRAN (R 4.3.2)
##  pacman             0.5.1      2019-03-11 [1] CRAN (R 4.3.2)
##  patchwork        * 1.2.0.9000 2024-04-28 [1] Github (thomasp85/patchwork@d943757)
##  permute          * 0.9-7      2022-01-27 [1] CRAN (R 4.3.2)
##  phyloseq         * 1.46.0     2023-10-24 [1] Bioconductor
##  pillar             1.9.0      2023-03-22 [2] CRAN (R 4.3.2)
##  pkgbuild           1.4.2      2023-06-26 [2] CRAN (R 4.3.2)
##  pkgconfig          2.0.3      2019-09-22 [2] CRAN (R 4.3.2)
##  pkgload            1.3.3      2023-09-22 [2] CRAN (R 4.3.2)
##  plyr               1.8.9      2023-10-02 [2] CRAN (R 4.3.2)
##  prettyunits        1.2.0      2023-09-24 [2] CRAN (R 4.3.2)
##  processx           3.8.2      2023-06-30 [2] CRAN (R 4.3.2)
##  profvis            0.3.8      2023-05-02 [2] CRAN (R 4.3.2)
##  promises           1.2.1      2023-08-10 [2] CRAN (R 4.3.2)
##  ps                 1.7.5      2023-04-18 [2] CRAN (R 4.3.2)
##  purrr            * 1.0.2      2023-08-10 [2] CRAN (R 4.3.2)
##  R6                 2.5.1      2021-08-19 [2] CRAN (R 4.3.2)
##  Rcpp               1.0.11     2023-07-06 [2] CRAN (R 4.3.2)
##  RCurl              1.98-1.13  2023-11-02 [2] CRAN (R 4.3.2)
##  readr            * 2.1.5      2024-01-10 [1] CRAN (R 4.3.2)
##  remotes            2.4.2.1    2023-07-18 [2] CRAN (R 4.3.2)
##  reshape2           1.4.4      2020-04-09 [2] CRAN (R 4.3.2)
##  rhdf5              2.46.1     2023-11-29 [1] Bioconductor 3.18 (R 4.3.2)
##  rhdf5filters       1.14.1     2023-11-06 [1] Bioconductor
##  Rhdf5lib           1.24.2     2024-02-07 [1] Bioconductor 3.18 (R 4.3.2)
##  rlang              1.1.2      2023-11-04 [2] CRAN (R 4.3.2)
##  rmarkdown          2.25       2023-09-18 [2] CRAN (R 4.3.2)
##  rstudioapi         0.15.0     2023-07-07 [2] CRAN (R 4.3.2)
##  S4Vectors          0.40.1     2023-10-26 [2] Bioconductor
##  sass               0.4.7      2023-07-15 [2] CRAN (R 4.3.2)
##  scales             1.3.0      2023-11-28 [2] CRAN (R 4.3.2)
##  sessioninfo        1.2.2      2021-12-06 [2] CRAN (R 4.3.2)
##  shiny              1.7.5.1    2023-10-14 [2] CRAN (R 4.3.2)
##  stringi            1.7.12     2023-01-11 [2] CRAN (R 4.3.2)
##  stringr          * 1.5.0      2022-12-02 [2] CRAN (R 4.3.2)
##  survival           3.5-7      2023-08-14 [2] CRAN (R 4.3.2)
##  tibble           * 3.2.1      2023-03-20 [2] CRAN (R 4.3.2)
##  tidyr            * 1.3.0      2023-01-24 [2] CRAN (R 4.3.2)
##  tidyselect         1.2.0      2022-10-10 [2] CRAN (R 4.3.2)
##  tidyverse        * 2.0.0      2023-02-22 [1] CRAN (R 4.3.2)
##  timechange         0.3.0      2024-01-18 [1] CRAN (R 4.3.2)
##  tzdb               0.4.0      2023-05-12 [1] CRAN (R 4.3.2)
##  urlchecker         1.0.1      2021-11-30 [2] CRAN (R 4.3.2)
##  usethis          * 2.2.2      2023-07-06 [2] CRAN (R 4.3.2)
##  utf8               1.2.4      2023-10-22 [2] CRAN (R 4.3.2)
##  vctrs              0.6.4      2023-10-12 [2] CRAN (R 4.3.2)
##  vegan            * 2.6-4      2022-10-11 [1] CRAN (R 4.3.2)
##  withr              2.5.2      2023-10-30 [2] CRAN (R 4.3.2)
##  xfun               0.41       2023-11-01 [2] CRAN (R 4.3.2)
##  xtable             1.8-4      2019-04-21 [2] CRAN (R 4.3.2)
##  XVector            0.42.0     2023-10-24 [2] Bioconductor
##  yaml               2.3.7      2023-01-23 [2] CRAN (R 4.3.2)
##  zlibbioc           1.48.0     2023-10-24 [2] Bioconductor
## 
##  [1] /home/dt473/R/x86_64-pc-linux-gnu-library/4.3
##  [2] /programs/R-4.3.2/library
## 
## ──────────────────────────────────────────────────────────────────────────────